InĀ [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
1. Read the file¶
InĀ [2]:
#1. ŲØŲ§Ų±ŚÆŲ°Ų§Ų±Ū ŲÆŲ§ŲÆŁāŁŲ§
dff = pd.read_excel('cleardata.xlsx')
InĀ [4]:
dff = dff.rename(columns={'managhe_shahrdari': 'region'})
InĀ [5]:
## ostan tehran
df = dff[dff.Ostan == "Tehran"]
InĀ [6]:
df
Out[6]:
| region | masahat | price | age | eskelet | date | Ostan | Shahrestan | |
|---|---|---|---|---|---|---|---|---|
| 0 | NaN | 60.00 | 5000.00 | 10 | felezi | 1395/01/01 | Tehran | Pakdasht |
| 1 | 14.0 | 70.63 | 35962.06 | 1 | botoni | 1395/01/01 | Tehran | Tehran |
| 3 | 2.0 | 196.16 | 173327.90 | 20 | felezi | 1395/01/01 | Tehran | Tehran |
| 6 | 1.0 | 87.00 | 34482.77 | 1 | botoni | 1395/01/01 | Tehran | Tehran |
| 8 | 6.0 | 108.88 | 60617.19 | 39 | felezi | 1395/01/02 | Tehran | Tehran |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 331843 | 5.0 | 120.00 | 45000.00 | 1 | botoni | 1395/12/30 | Tehran | Tehran |
| 331844 | 10.0 | 84.54 | 33120.42 | 2 | botoni and felezi | 1395/12/30 | Tehran | Tehran |
| 331845 | 16.0 | 47.95 | 21897.81 | 4 | felezi | 1395/12/30 | Tehran | Ray |
| 331855 | 4.0 | 65.00 | 32000.00 | 3 | botoni | 1395/12/30 | Tehran | Tehran |
| 331859 | NaN | 35.00 | 10000.00 | 13 | felezi | 1395/12/30 | Tehran | Quadruple |
184244 rows Ć 8 columns
2.Exploring the data¶
InĀ [7]:
df.describe()
Out[7]:
| region | masahat | price | age | |
|---|---|---|---|---|
| count | 164985.000000 | 1.842440e+05 | 1.842430e+05 | 184244.000000 |
| mean | 8.505428 | 7.399230e+02 | 4.552438e+04 | 7.854139 |
| std | 5.643888 | 2.062575e+05 | 2.650269e+05 | 9.208918 |
| min | 1.000000 | 1.000000e+00 | 1.000000e-02 | 0.000000 |
| 25% | 4.000000 | 6.043000e+01 | 2.375000e+04 | 1.000000 |
| 50% | 7.000000 | 7.677000e+01 | 3.462604e+04 | 5.000000 |
| 75% | 13.000000 | 1.007000e+02 | 5.000000e+04 | 13.000000 |
| max | 22.000000 | 8.840498e+07 | 7.100000e+07 | 1309.000000 |
InĀ [8]:
df.head()
Out[8]:
| region | masahat | price | age | eskelet | date | Ostan | Shahrestan | |
|---|---|---|---|---|---|---|---|---|
| 0 | NaN | 60.00 | 5000.00 | 10 | felezi | 1395/01/01 | Tehran | Pakdasht |
| 1 | 14.0 | 70.63 | 35962.06 | 1 | botoni | 1395/01/01 | Tehran | Tehran |
| 3 | 2.0 | 196.16 | 173327.90 | 20 | felezi | 1395/01/01 | Tehran | Tehran |
| 6 | 1.0 | 87.00 | 34482.77 | 1 | botoni | 1395/01/01 | Tehran | Tehran |
| 8 | 6.0 | 108.88 | 60617.19 | 39 | felezi | 1395/01/02 | Tehran | Tehran |
InĀ [9]:
# Ł
ŲŲ§Ų³ŲØŁ Ś©ŁŲ§ŁŲŖŲ§ŪŁāŁŲ§Ū Ū²Ū° Ł ŪøŪ° ŲÆŲ±ŲµŲÆ
q_20 = df['price'].quantile(0.2)
q_80 = df['price'].quantile(0.8)
# ŁŪŁŲŖŲ±ŪŁŚÆ ŲØŲ± Ų§Ų³Ų§Ų³ Ś©ŁŲ§ŁŲŖŲ§ŪŁāŁŲ§
df = df[(df['price'] >= q_20) & (df['price'] <= q_80)]
InĀ [10]:
# price / 10,000
df['price'] = df['price']%10000
df.head(2)
/tmp/ipykernel_22977/346169701.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['price'] = df['price']%10000
Out[10]:
| region | masahat | price | age | eskelet | date | Ostan | Shahrestan | |
|---|---|---|---|---|---|---|---|---|
| 1 | 14.0 | 70.63 | 5962.06 | 1 | botoni | 1395/01/01 | Tehran | Tehran |
| 6 | 1.0 | 87.00 | 4482.77 | 1 | botoni | 1395/01/01 | Tehran | Tehran |
InĀ [11]:
df['price'] = np.log(df['price'])
df.head(5)
/home/anjel/.local/lib/python3.11/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /tmp/ipykernel_22977/3109476270.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['price'] = np.log(df['price'])
Out[11]:
| region | masahat | price | age | eskelet | date | Ostan | Shahrestan | |
|---|---|---|---|---|---|---|---|---|
| 1 | 14.0 | 70.63 | 8.693171 | 1 | botoni | 1395/01/01 | Tehran | Tehran |
| 6 | 1.0 | 87.00 | 8.407996 | 1 | botoni | 1395/01/01 | Tehran | Tehran |
| 21 | 18.0 | 64.65 | 7.582952 | 10 | botoni | 1395/01/02 | Tehran | Tehran |
| 25 | 11.0 | 44.02 | 8.514919 | 23 | felezi | 1395/01/03 | Tehran | Tehran |
| 40 | 5.0 | 103.83 | 7.819897 | 3 | botoni | 1395/01/03 | Tehran | Tehran |
InĀ [12]:
df['price'].isin([-np.inf]).sum()
Out[12]:
1773
InĀ [13]:
df = df[~df['price'].isin([-np.inf])]
InĀ [14]:
#check -inf value is clear
df['price'].isin([-np.inf]).sum()
Out[14]:
0
InĀ [15]:
#ŲØŲ±Ų±Ų±Ų³Ū Ų¹ŲÆŲÆ Ū° Ł Ł
ŁŁŪ ŲÆŲ± Ų³ŲŖŁŁ ŁŲ§
count_zero_or_negative1 = (df['masahat'] <= 0).sum()
print(count_zero_or_negative1, 'Ų¹ŲÆŲÆ Ł
ŁŁŪ ŲÆŲ± Ł
Ų³Ų§ŲŲŖ')
count_zero_or_negative2 = (df['price'] <= 0).sum()
print(count_zero_or_negative2, ' : Ų¹ŲÆŲÆ Ł
ŁŁŪ ŲÆŲ± ŪŚ© Ł
ŲŖŲ± Ł
Ų±ŲØŲ¹')
count_zero_or_negative3 = (df['age'] < 0).sum()
print(count_zero_or_negative3, ' : Ų¹ŲÆŲÆ Ł
ŁŁŪ ŲÆŲ± س٠بŁŲ§')
0 Ų¹ŲÆŲÆ Ł ŁŁŪ ŲÆŲ± Ł Ų³Ų§ŲŲŖ 0 : Ų¹ŲÆŲÆ Ł ŁŁŪ ŲÆŲ± ŪŚ© Ł ŲŖŲ± Ł Ų±ŲØŲ¹ 0 : Ų¹ŲÆŲÆ Ł ŁŁŪ ŲÆŲ± س٠بŁŲ§
InĀ [16]:
# value <0 is drop
# ŲŲ°Ł Ų³Ų·Ų±ŁŲ§ŪŪ Ś©Ł Ł
ŁŲÆŲ§Ų± ŪŚ©Ū Ų§Ų² Ų³ŲŖŁŁāŁŲ§ (masahatŲ gheymat_1_metr_morabaŲ age_bana) Ł
ŁŁŪ ŪŲ§ ŲµŁŲ± ŲØŲ§Ų“ŲÆ
df = df[(df['masahat'] > 0) & (df['price'] > 0) & (df['age'] > 0)]
InĀ [17]:
#ŲØŲ±Ų±Ų±Ų³Ū Ų¹ŲÆŲÆ Ū° Ł Ł
ŁŁŪ ŲÆŲ± Ų³ŲŖŁŁ ŁŲ§
count_zero_or_negative1 = (df['masahat'] <= 0).sum()
print(count_zero_or_negative1, 'Ų¹ŲÆŲÆ Ł
ŁŁŪ ŲÆŲ± Ł
Ų³Ų§ŲŲŖ')
count_zero_or_negative2 = (df['price'] <= 0).sum()
print(count_zero_or_negative2, ' ŁŪŁ
ŲŖ :')
count_zero_or_negative3 = (df['age'] < 0).sum()
print(count_zero_or_negative3, ' : Ų¹ŲÆŲÆ Ł
ŁŁŪ ŲÆŲ± س٠بŁŲ§')
0 Ų¹ŲÆŲÆ Ł ŁŁŪ ŲÆŲ± Ł Ų³Ų§ŲŲŖ 0 ŁŪŁ ŲŖ : 0 : Ų¹ŲÆŲÆ Ł ŁŁŪ ŲÆŲ± س٠بŁŲ§
InĀ [18]:
df.isnull().sum()
Out[18]:
region 1476 masahat 0 price 0 age 0 eskelet 0 date 0 Ostan 0 Shahrestan 0 dtype: int64
InĀ [19]:
df.fillna(0, inplace=True)
df.isnull().sum()
Out[19]:
region 0 masahat 0 price 0 age 0 eskelet 0 date 0 Ostan 0 Shahrestan 0 dtype: int64
InĀ [20]:
df.count()
Out[20]:
region 95427 masahat 95427 price 95427 age 95427 eskelet 95427 date 95427 Ostan 95427 Shahrestan 95427 dtype: int64
InĀ [21]:
df.head(2)
Out[21]:
| region | masahat | price | age | eskelet | date | Ostan | Shahrestan | |
|---|---|---|---|---|---|---|---|---|
| 1 | 14.0 | 70.63 | 8.693171 | 1 | botoni | 1395/01/01 | Tehran | Tehran |
| 6 | 1.0 | 87.00 | 8.407996 | 1 | botoni | 1395/01/01 | Tehran | Tehran |
InĀ [22]:
df.shape
Out[22]:
(95427, 8)
InĀ [23]:
px.histogram(df, x='price')
InĀ [24]:
px.histogram(df, x='masahat')
InĀ [25]:
px.histogram(df, x='age')
InĀ [26]:
px.scatter(df, x='price', y='masahat',
labels={"price": "gheymat_yek_metr_moraba bray har saze",
"masahat": "masahat_saze"},
width=350, height=250)
InĀ [27]:
labels = {
'Ostan': 'Ų§Ų³ŲŖŲ§Ł',
'price': 'ŁŪŁ
ŲŖ'
}
# Ų±Ų³Ł
ŁŁ
ŁŲÆŲ§Ų± area ŲØŲ§ Ų§Ų³ŲŖŁŲ§ŲÆŁ Ų§Ų² plotly.express
fig = px.area(df, x='Shahrestan', y='price', color='Shahrestan', line_group='Shahrestan', labels=labels, width=600, height=350)
# ŁŁ
Ų§ŪŲ“ ŁŁ
ŁŲÆŲ§Ų±
fig.show()
InĀ [28]:
from itertools import cycle
labels = {
'Shahrestan': 'Ų“ŁŲ±Ų³ŲŖŲ§Ł',
'price': 'ŁŪŁ
ŲŖ',
'Year': 'Ų³Ų§Ł'
}
# Ų±Ų³Ł
ŁŁ
ŁŲÆŲ§Ų± Ų®Ų·Ū ŲØŲ§ Ų§Ų³ŲŖŁŲ§ŲÆŁ Ų§Ų² plotly.express
fig = px.line(df, x='date', y='price', color='Shahrestan', width=600, height=350, labels=labels)
# ŲŖŁŲøŪŁ
Ų§Ų³ŲŖŲ§ŪŁ Ų®Ų·āŁŲ§Ū ŁŁ
ŁŲÆŲ§Ų±
styles = cycle([None, 'dashdot', 'dash', 'dot'])
for ostan in df['Shahrestan'].unique():
fig.update_traces(selector=dict(name=ostan), line=dict(dash=next(styles)))
fig.update_yaxes(title_text='ŁŪŁ
ŲŖ (ŲŖŁŁ
Ų§Ł)')
fig.update_xaxes(title_text='Ų³Ų§Ł')
# ŁŁ
Ų§ŪŲ“ ŁŁ
ŁŲÆŲ§Ų±
fig.show()
InĀ [29]:
# ŲŖŲ¹Ų±ŪŁ ŲØŲ±ŚŲ³ŲØāŁŲ§ ŲØŲ±Ų§Ū ŁŁ
ŁŲÆŲ§Ų±
df_labels = {
'price': 'ŁŪŁ
ŲŖ ŁŲ±ŁŲ“ (ŲŖŁŁ
Ų§Ł)',
'date': 'ŲŖŲ§Ų±ŪŲ®',
'masahat': 'Ł
Ų³Ų§ŲŲŖ (Ł
ŲŖŲ± Ł
Ų±ŲØŲ¹)'
}
# Ų±Ų³Ł
ŁŁ
ŁŲÆŲ§Ų± Ų®Ų·Ū ŲØŲ§ Ų§Ų³ŲŖŁŲ§ŲÆŁ Ų§Ų² plotly.express
fig = px.line(df, x='date', y='price', title='ŁŪŁ
ŲŖ ŁŲ±ŁŲ“ Ł
ŪŲ§ŁŁ', labels=df_labels, width=500, height=250)
# Ų§ŁŲ²ŁŲÆŁ ŲŲ§Ų“ŪŁāŁŲ§ ŲØŁ ŁŁ
ŁŲÆŲ§Ų±
fig.update_layout(margin=dict(t=30))
# ŁŁ
Ų§ŪŲ“ ŁŁ
ŁŲÆŲ§Ų±
fig.show()
InĀ [30]:
#Ų±Ų³Ł
ŁŁ
ŁŲÆŲ§Ų± scatter ŲØŲ±Ų§Ū ŲÆŁ ŁŪŚŚÆŪ SepalLength Ł SepalWidth
plt.figure(figsize=(8, 6))
plt.scatter(df['masahat'], df['price'], color='blue', alpha=0.7)
plt.title('masaht vs. price')
plt.xlabel('price')
plt.ylabel('masahat')
plt.grid(True)
plt.show()
InĀ [31]:
city_counts = df['Ostan'].value_counts()
print(city_counts)
Ostan Tehran 95427 Name: count, dtype: int64